ftp.mactech.com 2010

home *** CD-ROM | disk | FTP | other *** search

/ ftp.mactech.com 2010 / ftp.mactech.com.tar / ftp.mactech.com / machack / Hacks97 / NewsTicker.sit / NewsTicker / source code / Extractors / HTMLExtractor.cp < prev next >

Wrap

Text File | 1997-06-27 | 12KB | 599 lines

/*------------------------------------------------------------------------------
#
#	NewsTicker, my Hack for 1997
#
#	HTMLExtractor.cp	- Base class to read an HTML page in, and parse
#						  out the interesting stuff. Useless on its own,
#						  only exists to be derived..
#
------------------------------------------------------------------------------*/

#include <Threads.h>
#include <strings.h>
#include "HTMLExtractor.h"
#include "SubWooferEndPoint.h"
#include "HTTPEndPoint.h"
#include "Idler.h"
#include "TickerGlobals.h"		//get our structures and all
#include "TickerWindowHandler.h"
#include "BeachBall.h"
#include <string.h>

// Shared beach ball cursor. Created lazily by ReadEntries/ReadLastModified and
// never deleted in this file.
BeachBall* gTheBall = nil;

//
//	Idler handed to the endpoints; its YieldAction is also called directly from
//	the read loops below while a transfer is in progress.
//
class TickerIdler : public Idler
{
	private:
		long			mlWNEDelay;			// sleep value passed to WaitNextEvent
		HTMLExtractor*	mftheExtractor;		// extractor to cancel on abort (not deleted here)

	protected:
		// Declared but not defined in this file.
		TickerIdler (const TickerIdler& oRHS);
		TickerIdler& operator= (const TickerIdler& oRHS);

	public:
		TickerIdler (HTMLExtractor* theExtractor);
		virtual void YieldAction (void);
		virtual ~TickerIdler (void) { }
};

//
//	The idler below does many things while waiting for data to send/receive from
//	the Internet. It spins a beachball, and recognizes as we come to/from the background,
//	and it scrolls our window below us. It also recognizes command-period or closing the
//	window to abort a read/write
//

#define kDefaultWNEDelay	15
#define kDefaultIdlerPeriod	15

// Prefix of the HTTP header line that ReceiveString saves.
#define kLastModifiedPrefix	"Last-Modified:"

// Most characters FindATag/SaveHRef will store in the caller's link buffer
// (excluding the terminating NUL). ParseGoodURL supplies a 256-byte buffer.
// NOTE(review): callers outside this file must supply at least 256 bytes -- confirm.
#define kMaxLinkChars		255

// Longest text a Pascal Str255 can hold.
#define kMaxPStringChars	255

//	Constructor. Tell the Idler base class how often to call yield action.
//	theExtractor may be nil; it is the object cancelled when the user aborts.
TickerIdler::TickerIdler (HTMLExtractor* theExtractor)
{
	// The original passed the WNE delay to SetPeriod and the idler period to
	// mlWNEDelay. Both constants are 15, so un-swapping them changes nothing at
	// run time, but each value now goes where its name says.
	SetPeriod (kDefaultIdlerPeriod);
	mlWNEDelay = kDefaultWNEDelay;
	mftheExtractor = theExtractor;
}

//	One idle pass: service the window, spin the cursor, and handle one event.
void TickerIdler::YieldAction (void)
{
	EventRecord sEvent;

	JustHandleWindow();
#ifdef USESUBWOOFER
	YieldToAnyThread();
#endif

	if ((gTheBall!=nil)&&(!gInBackground))
		gTheBall->Idle();					//spin our beach ball cursor

	// NOTE(review): gDoneFlag is defined elsewhere; the read is cancelled whenever
	// it is false. Presumably this is the "window was closed" abort -- confirm.
	if (!gDoneFlag)
	{
		if (mftheExtractor)
			mftheExtractor->Cancel();
	}

	if (WaitNextEvent(everyEvent, &sEvent, mlWNEDelay, nil))
	{
		switch (sEvent.what)
		{
			case kHighLevelEvent:
				AEProcessAppleEvent(&sEvent);
				break;

			case keyDown:					// command-period aborts the transfer
				if (((sEvent.message & charCodeMask)=='.')&&(sEvent.modifiers & cmdKey))
				{
					if (mftheExtractor)
						mftheExtractor->Cancel();
				}
				break;

			case osEvt:
				if (((sEvent.message >> 24) & 0x0FF) == kSuspendResumeMessage)	/* high byte of message */
				{
					gInBackground = (sEvent.message & kResumeMask) == 0;
				}
				break;
		}
	}
}

//	theaddress	- C string copied into mfAddress (used as the host in ParseGoodURL)
//	theIconID	- cicn resource ID stamped on every headline this extractor adds
//	theDataPtr	- shared headline table that ReadEntries updates
HTMLExtractor::HTMLExtractor (char* theaddress, short theIconID, sMyDataPtr theDataPtr)
{
#ifdef USESUBWOOFER
	mfWebPipe = nil;
#else
	mfHTTPPipe = nil;
#endif
	mfDoingARead = false;
	mfReadingHeader = false;	// was left uninitialised until the first read
	mfTempHeadlineCount = 0;	// was left uninitialised until the first read
	mfTheDataPtr = theDataPtr;
	mfLastModified[0] = 0;
	mfIconID = theIconID;
	// NOTE(review): unbounded copy; the size of mfAddress is declared in
	// HTMLExtractor.h and cannot be checked from here.
	strcpy(mfAddress, theaddress);
}

HTMLExtractor::~HTMLExtractor (void)
{
#ifdef USESUBWOOFER
	if (mfWebPipe)
	{
		delete mfWebPipe;
		mfWebPipe = nil;
	}
#else
	if (mfHTTPPipe)
	{
		delete mfHTTPPipe;
		mfHTTPPipe = nil;
	}
#endif
}

//	Append one headline (Pascal strings) to the temporary list built during a read.
//	Silently drops the entry once tempmaxHeadlines have been collected.
void HTMLExtractor::AddEntry(Str255 theSubject, Str255 theURL)
{
	if (gThePrefs.JustShowFirstThree&&(mfTempHeadlineCount>=3))	//demo mode
	{
		mfDoingARead = false;		// ends the read loop in ReadEntries
		return;
	}

	if (mfTempHeadlineCount<tempmaxHeadlines)
	{
		PLstrcpy(mfTempHeadlines[mfTempHeadlineCount].Subject, theSubject);
		PLstrcpy(mfTempHeadlines[mfTempHeadlineCount].URL, theURL);
		mfTempHeadlines[mfTempHeadlineCount].cicnResID = mfIconID;
		mfTempHeadlineCount++;
	}
}

//	Called by base app to read all entries in, or check header and see if it's changed.
//	Fetches mfAddress on port 80, idling until mfDoingARead is cleared, then replaces
//	this extractor's headlines (matched by cicnResID) in the shared table.
//	If the fetch cannot be started the shared table is left untouched.
void HTMLExtractor::ReadEntries (void)
{
	short			index;
	short			destindex;
	TickerIdler*	theidler = new TickerIdler(this);

	mfDoingARead = true;
	mfReadingHeader = true;
	thetextsize = 0;
	thetagsize = 0;
	AmOnTag = false;
	mfTempHeadlineCount = 0;

	if (!gTheBall)
		gTheBall = new BeachBall();

	// Use the subwoofer code
#ifdef USESUBWOOFER
	OSErr			io;

	mfReadingHeader = false;	//we don't get headers from Subwoofer
	if (mfWebPipe)
	{
		delete mfWebPipe;
		mfWebPipe = nil;
	}
	mfWebPipe = new SubWooferEndPoint(this);

	if (mfWebPipe->StartGettingFile(mfAddress, 80, theidler)!=noErr)
	{
		delete mfWebPipe;
		mfWebPipe = nil;
		delete theidler;
		return;
	}

	do
	{
		mfWebPipe->DoIdle();
		theidler->YieldAction();
	} while (mfDoingARead);

	io = mfWebPipe->GetSubWoofHeader(mfLastModified);	// result is not checked

	delete mfWebPipe;
	mfWebPipe = nil;
#else
	//
	// Use the raw OT stuff
	if (mfHTTPPipe)
	{
		delete mfHTTPPipe;
		mfHTTPPipe = nil;
	}
	mfHTTPPipe = new HTTPEndPoint(this);

	if (mfHTTPPipe->StartGettingFile(mfAddress, 80, theidler)!=noErr)
	{
		delete mfHTTPPipe;
		mfHTTPPipe = nil;
		delete theidler;
		return;
	}

	do
	{
		mfHTTPPipe->DoIdle();
		theidler->YieldAction();
	} while (mfDoingARead);

	delete mfHTTPPipe;
	mfHTTPPipe = nil;
#endif

	delete theidler;

	// Delete all entries with cicnResID = mfIconID
	destindex = 0;
	for (index = 0; index < mfTheDataPtr->MsgCount; index++)
	{
		if (mfTheDataPtr->theHeadlines[index].cicnResID!=mfIconID)	//don't delete it
		{
			if (index!=destindex)	//copy down if we need to
			{
				mfTheDataPtr->theHeadlines[destindex] = mfTheDataPtr->theHeadlines[index];
			}
			destindex++;
		}
	}
	mfTheDataPtr->MsgCount = destindex;

	// Now copy the entries we accumulated out
	for (index = 0; index<mfTempHeadlineCount; index++)		// copy the entries off
	{
		if (mfTheDataPtr->MsgCount<maxHeadlines)
		{
			mfTheDataPtr->theHeadlines[mfTheDataPtr->MsgCount] = mfTempHeadlines[index];
			mfTheDataPtr->MsgCount++;
		}
	}
}

//	Called by endpoint as it gets strings.
//	While reading the header: a line of two characters or fewer ends the header,
//	and a Last-Modified line is saved (clipped to 31 characters) in mfLastModified.
//	Afterwards: splits the stream into tags ("<...>") and the text between them
//	and passes each piece to HandleToken.
void HTMLExtractor::ReceiveString (char* string, short numchars)
{
	short	index;
	char	thechar;

	if (mfReadingHeader)
	{
		if (numchars <= 2)		//must be crlf
			mfReadingHeader = false;
		else
		{
			//if Last-modified line, save it
			// Only compare when the line is at least as long as the prefix, so
			// MyCompareStr never looks beyond the numchars we were given.
			if ((numchars >= (short)(sizeof(kLastModifiedPrefix) - 1))
				&& MyCompareStr(string, kLastModifiedPrefix))
			{
				if (numchars>31)
					numchars = 31;
				mfLastModified[0] = numchars;
				BlockMove(string, &mfLastModified[1], numchars);
			}
		}
	}
	else
	{
		for (index = 0; index<numchars; index++)
		{
			thechar = string[index];
			if ((thechar==0x0d)||(thechar==0x0a)||(thechar==0x09))	//make carriage returns, line feeds and tabs spaces
				thechar = ' ';

			if (AmOnTag)
			{
				if ((thetagsize<2047)&&((thetagsize>0)||(thechar!=' ')))	//add this character to the tag
				{
					thetag[thetagsize] = thechar;
					thetagsize++;
				}
				if (thechar=='>')	//end of tag?
				{
					thetag[thetagsize] = 0;					//make it a nice C string
					HandleToken(thetag, thetagsize, true);	//and handle it
					thetextsize = 0;						//And start getting text
					AmOnTag = false;
				}
			}
			else
			{
				if (thechar=='<')	//start of tag?
				{
					if (thetextsize>0)		//any text to handle?
					{
						thetext[thetextsize] = 0;
						HandleToken(thetext, thetextsize, false);	//handle the text
					}
					thetag[0] = thechar;	//put this in the tag and start parsing it
					thetagsize = 1;
					AmOnTag = true;
				}
				else	//nope, just add to the text (leading spaces are dropped)
				{
					if ((thetextsize<2047)&&((thetextsize>0)||(thechar!=' ')))
					{
						thetext[thetextsize] = thechar;
						thetextsize++;
					}
				}
			}
		}
	}
}

//	Does nothing in the base class; ReceiveString calls it with each complete tag
//	(isCommand true) or run of text (isCommand false).
void HTMLExtractor::HandleToken(char* string, short numchars, Boolean isCommand)
{
}

//	Ends the current read loop.
void HTMLExtractor::Disconnect(void)
{
	mfDoingARead = false;
}

// Cancel the connection
//
void HTMLExtractor::Cancel(void)
{
	mfDoingARead = false;
}

//	Called by base app to read the header in.
//	Same transfer loop as ReadEntries but starts a header-only fetch and does not
//	touch the shared headline table.
void HTMLExtractor::ReadLastModified(void)
{
	TickerIdler*	theidler = new TickerIdler(this);

	mfDoingARead = true;
	mfReadingHeader = true;
	thetextsize = 0;
	thetagsize = 0;
	AmOnTag = false;
	mfTempHeadlineCount = 0;

	if (!gTheBall)
		gTheBall = new BeachBall();

	// Use the subwoofer code
#ifdef USESUBWOOFER
	OSErr			io;

	if (mfWebPipe)
	{
		delete mfWebPipe;
		mfWebPipe = nil;
	}
	mfWebPipe = new SubWooferEndPoint(this);

	if (mfWebPipe->StartGettingHeader(mfAddress, 80, theidler)!=noErr)
	{
		delete mfWebPipe;
		mfWebPipe = nil;
		delete theidler;
		return;
	}

	do
	{
		mfWebPipe->DoIdle();
		theidler->YieldAction();
	} while (mfDoingARead);

	io = mfWebPipe->GetSubWoofHeader(mfLastModified);	// result is not checked

	delete mfWebPipe;
	mfWebPipe = nil;
#else
	//
	// Use the raw OT stuff
	if (mfHTTPPipe)
	{
		delete mfHTTPPipe;
		mfHTTPPipe = nil;
	}
	mfHTTPPipe = new HTTPEndPoint(this);

	if (mfHTTPPipe->StartGettingHeader(mfAddress, 80, theidler)!=noErr)
	{
		delete mfHTTPPipe;
		mfHTTPPipe = nil;
		delete theidler;
		return;
	}

	do
	{
		mfHTTPPipe->DoIdle();
		theidler->YieldAction();
	} while (mfDoingARead);

	delete mfHTTPPipe;
	mfHTTPPipe = nil;
#endif

	delete theidler;
}

//	Copy the saved Last-Modified Pascal string into LastModStr.
void HTMLExtractor::GetLastModified (Str31 LastModStr)
{
	PLstrcpy(LastModStr, mfLastModified);
}

//
//	Here is some standard code to help parse the HTML
//

//	Return the first character at or after pcSrc that is not a space, CR or LF.
static char* SkipWhiteChars(char* pcSrc)
{
	while ((*pcSrc != 0) && ((*pcSrc == ' ') || (*pcSrc == '\r') || (*pcSrc == '\n')))
		pcSrc++;
	return pcSrc;
}

//	Skip white space, at most one '=', and any white space after it.
static char* SkipWhiteCharsAndEqual(char* pcSrc)
{
	pcSrc = SkipWhiteChars(pcSrc);
	if (*pcSrc == '=')
		pcSrc++;
	pcSrc = SkipWhiteChars(pcSrc);
	return pcSrc;
}

//	True when the first strlen(p2) bytes of p1 compare equal to p2 according to
//	IdenticalText. p1 must have at least strlen(p2) readable bytes.
Boolean MyCompareStr(char* p1, char* p2)
{
	short thelength = strlen(p2);

	return (IdenticalText (p1, p2, thelength, thelength, nil)==0);
}

//
//	Look for some quoted data for a given marker
//
//	tag			- C string positioned on the space that precedes the first attribute
//	theLink		- receives the quoted value as a C string; left empty when the marker
//				  is not found, the value is not quoted, or the value starts with '#'
//	theMarker	- attribute name to look for (e.g. "HREF")
//
//	At most kMaxLinkChars characters are stored; anything longer is truncated.
//
void FindATag(char* tag, char* theLink, char* theMarker)
{
	char*	cp;

	cp = theLink;
	*cp = 0;

	do
	{
		if (*tag != ' ')
			return;
		tag++;
		//DebugStr("\pPreparing to get the tag");
		tag = ::SkipWhiteChars(tag);
		if (*tag == 0)		// ran out of tag: nothing left to compare or skip
			return;

		if (::MyCompareStr(tag, theMarker))
		{
			// Was sizeof(theMarker): that is the size of a pointer, which only
			// equals the marker length for 4-character markers on a 32-bit target.
			tag += strlen(theMarker);
			tag = ::SkipWhiteCharsAndEqual(tag);
			if (*tag != '"')
				return;
			tag++;
			if (*tag == '#')	//A navigation on same page link
				return;

			while ((*tag != 0) && (*tag != '"'))
			{
				if (*tag=='?')			//restart, this was apple funkiness
				{
					cp = theLink;
					tag++;
				}
				else if (*tag=='$')		//another part of funkiness, this isn't a good link
				{
					*theLink = 0;
					return;
				}
				else
				{
					// The tag comes off the network and can be far longer than the
					// caller's buffer, so stop storing once the buffer is full.
					if ((cp - theLink) < kMaxLinkChars)
						*(cp++) = *tag;
					tag++;
				}
			}
			*cp = 0;	//mark the end
			return;
		}
		else
		{
			do			//Skip this item. Get past the marker
			{
				tag++;
			} while ((*tag!=0) && (*tag!='='));
			if (*tag == 0)		// no '=' before the end: don't step past the NUL
				return;
			tag++;		//skip the =

			if (*tag=='"')
			{
				tag++;
				// A while loop (not do-while) so an empty value ("") stops on its
				// own closing quote instead of running into the next attribute.
				while ((*tag!=0) && (*tag!='"'))	//Skip the quoted data
					tag++;
			}
			if (*tag == 0)		// unterminated value: don't step past the NUL
				return;

			do			//Skip the data, waiting for a space
			{
				tag++;
			} while ((*tag!=0) && (*tag!=' ') && (*tag!='>'));
		}
	} while ((*tag!='>')&&(*tag!=0));
}

//	Older HREF-only version of FindATag; its only call in this file is commented out.
//	Stores at most kMaxLinkChars characters in HTMLLink.
static void SaveHRef(char* tag, char* HTMLLink)
{
	char*	cp;

	//See if it's A HREF="
	cp = HTMLLink;
	*cp = 0;

	if (*tag != ' ')
		return;
	tag++;
	tag = ::SkipWhiteChars(tag);
	if (!::MyCompareStr(tag, "HREF"))
		return;
	tag += 4;
	tag = ::SkipWhiteCharsAndEqual(tag);
	if (*tag != '"')
		return;
	tag++;
	if (*tag == '#')	//A navigation on same page link
		return;

	while ((*tag != 0) && (*tag != '"'))
	{
		if (*tag=='?')	//restart, this was apple funkiness
		{
			cp = HTMLLink;
			tag++;
		}
		else
		{
			if ((cp - HTMLLink) < kMaxLinkChars)	// never write past the buffer
				*(cp++) = *tag;
			tag++;
		}
	}
	*cp = 0;	//mark the end
}

//	True when theURL contains a ':' anywhere.
static Boolean isFullURL(char* theURL)
{
	for ( ; *theURL != 0; theURL++)
		if (*theURL == ':')
			return true;
	return false;
}

//	Append one character to a Pascal string, doing nothing once it already holds
//	kMaxPStringChars characters (so the length byte can never wrap around).
static void AppendCharToPString(Str255 thePString, char theChar)
{
	if (thePString[0] < kMaxPStringChars)
	{
		thePString[0]++;
		thePString[thePString[0]] = theChar;
	}
}

//	Pull the HREF out of a tag and turn it into a URL (Pascal string).
//
//	thestring	- tag text, positioned as FindATag expects
//	theURL		- receives the URL; links without a ':' are prefixed with
//				  "http://" + mfAddress + "/". Clipped to 255 characters.
//
//	Returns false (with theURL empty) when no usable HREF was found.
Boolean HTMLExtractor::ParseGoodURL(char* thestring, Str255 theURL)
{
	char	HTMLLink[256];				// kMaxLinkChars + terminating NUL
	char	headerstr[10] = "http://";
	short	index;

	theURL[0] = 0;

	//SaveHRef(thestring, HTMLLink);
	FindATag(thestring, HTMLLink, "HREF");
	if ((HTMLLink[0]==0)||(HTMLLink[0] == '#'))
		return false;

	if (!isFullURL(HTMLLink))
	{
		for (index = 0; headerstr[index]!=0; index++)	//http://
			AppendCharToPString(theURL, headerstr[index]);

		for (index = 0; mfAddress[index]!=0; index++)	//add our address to it
			AppendCharToPString(theURL, mfAddress[index]);

		AppendCharToPString(theURL, '/');
	}

	index = 0;
	if (HTMLLink[0] == '/')		// the separator was already added above
		index++;
	for ( ; HTMLLink[index] != 0; index++)
		AppendCharToPString(theURL, HTMLLink[index]);

	return true;
}